From 7570401dfac55eb779e4482a7fe8f4e9e17b23d7 Mon Sep 17 00:00:00 2001 From: "cl349@firebug.cl.cam.ac.uk" Date: Tue, 3 May 2005 14:41:52 +0000 Subject: [PATCH] bitkeeper revision 1.1389.4.1 (42778db0fg1Gd_OZr6N8_onX23xy3g) Patch to allow vcpu hotplugging in domU Signed-off-by: Ryan Harper Signed-off-by: Christian Limpach From: Ryan Harper Two config changes are required to use this change: 1) CONFIG_SMP=y 2) CONFIG_HOTPLUG_CPU=y I've tested unplugging/plugging cpus in domU via the sysfs interface that the patch provides. hungerforce:~# grep processor /proc/cpuinfo processor : 0 processor : 1 processor : 2 processor : 3 hungerforce:~# cd /sys/devices/system/cpu/ hungerforce:/sys/devices/system/cpu# echo 0 > cpu3/online hungerforce:/sys/devices/system/cpu# grep processor /proc/cpuinfo processor : 0 processor : 1 processor : 2 hungerforce:/sys/devices/system/cpu# echo 1 > cpu3/online hungerforce:/sys/devices/system/cpu# grep processor /proc/cpuinfo processor : 0 processor : 1 processor : 2 processor : 3 It seems that all processors besides cpu0 can be removed. I've not done any investigation nor optimization of the hotplug patch. I attempted to unplug cpus in dom0, but this resulted in Xen rebooting. Makefile, Kconfig, smpboot.c, smp.c, process.c, irq.c: Fix hotplug cpu support. Makefile: Setup reach-over build of topology.o. i386-cpu-hotplug-updated-for-mm.patch: Add i386 hotplug patch from: ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.11-rc5/2.6.11-rc5-mm1/broken-out/i386-cpu-hotplug-updated-for-mm.patch i386-cpu-hotplug-updated-for-mm.patch, Makefile: new file traps.c, smpboot.c, smp.c, process.c, irq.c, Kconfig: Merge changes from hotplug patch. --- .rootkeys | 2 + linux-2.6.11-xen-sparse/arch/xen/i386/Kconfig | 10 + .../arch/xen/i386/Makefile | 1 + .../arch/xen/i386/kernel/irq.c | 65 +- .../arch/xen/i386/kernel/process.c | 34 + .../arch/xen/i386/kernel/smp.c | 24 +- .../arch/xen/i386/kernel/smpboot.c | 101 ++- .../arch/xen/i386/kernel/traps.c | 8 + .../arch/xen/i386/mach-default/Makefile | 12 + .../i386-cpu-hotplug-updated-for-mm.patch | 656 ++++++++++++++++++ 10 files changed, 887 insertions(+), 26 deletions(-) create mode 100644 linux-2.6.11-xen-sparse/arch/xen/i386/mach-default/Makefile create mode 100644 patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch diff --git a/.rootkeys b/.rootkeys index cd12228470..ca9b9fa559 100644 --- a/.rootkeys +++ b/.rootkeys @@ -244,6 +244,7 @@ 40f562389xNa78YBZciUibQjyRU_Lg linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c 40f56238JypKAUG01ZojFwH7qnZ5uA linux-2.6.11-xen-sparse/arch/xen/i386/kernel/vsyscall.S 40f56238wi6AdNQjm0RT57bSkwb6hg linux-2.6.11-xen-sparse/arch/xen/i386/kernel/vsyscall.lds +427245dboQBkhq841wIPqlRD-AG9Jw linux-2.6.11-xen-sparse/arch/xen/i386/mach-default/Makefile 40f56238a3w6-byOzexIlMgni76Lcg linux-2.6.11-xen-sparse/arch/xen/i386/mm/Makefile 40f56238ILx8xlbywNbzTdv5Zr4xXQ linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c 4118cc35CbY8rfGVspF5O-7EkXBEAA linux-2.6.11-xen-sparse/arch/xen/i386/mm/highmem.c @@ -461,6 +462,7 @@ 422e4430-gOD358H8nGGnNWes08Nng netbsd-2.0-xen-sparse/sys/miscfs/kernfs/kernfs_vnops.c 413cb3b53nyOv1OIeDSsCXhBFDXvJA netbsd-2.0-xen-sparse/sys/nfs/files.nfs 413aa1d0oNP8HXLvfPuMe6cSroUfSA patches/linux-2.6.11/agpgart.patch +427261074Iy1MkbbqIV6zdZDWWx_Jg patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch 42372652KCUP-IOH9RN19YQmGhs4aA patches/linux-2.6.11/iomap.patch 424f001e_M1Tnxc52rDrmCLelnDWMQ patches/linux-2.6.11/x86_64-linux.patch 3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/Kconfig b/linux-2.6.11-xen-sparse/arch/xen/i386/Kconfig index a6416ed5ed..0fad9a4f33 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/Kconfig +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/Kconfig @@ -668,6 +668,16 @@ config X86_LOCAL_APIC depends on (X86_VISWS || SMP) && !X86_VOYAGER default n +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" + depends on SMP && HOTPLUG && EXPERIMENTAL + ---help--- + Say Y here to experiment with turning CPUs off and on. CPUs + can be controlled through /sys/devices/system/cpu. + + Say N. + + if XEN_PHYSDEV_ACCESS menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/Makefile b/linux-2.6.11-xen-sparse/arch/xen/i386/Makefile index 8af1059853..053c0984ac 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/Makefile +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/Makefile @@ -72,6 +72,7 @@ head-y := arch/xen/i386/kernel/head.o arch/xen/i386/kernel/init_task.o libs-y += arch/i386/lib/ core-y += arch/xen/i386/kernel/ \ arch/xen/i386/mm/ \ + arch/xen/i386/mach-default/ \ arch/i386/crypto/ # \ # arch/xen/$(mcore-y)/ diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/irq.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/irq.c index 6cd16ccfdc..f31697ecb1 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/irq.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/irq.c @@ -15,6 +15,9 @@ #include #include #include +#include +#include +#include #ifndef CONFIG_X86_LOCAL_APIC /* @@ -207,9 +210,8 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); - for (j=0; jtypename); seq_printf(p, " %s", action->name); @@ -237,16 +238,13 @@ skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", nmi_count(j)); + for_each_cpu(j) + seq_printf(p, "%10u ", nmi_count(j)); seq_putc(p, '\n'); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", - irq_stat[j].apic_timer_irqs); + for_each_cpu(j) + seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs); seq_putc(p, '\n'); #endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); @@ -256,3 +254,44 @@ skip: } return 0; } + +#ifdef CONFIG_HOTPLUG_CPU + +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + +#if 0 + barrier(); + /* Ingo Molnar says: "after the IO-APIC masks have been redirected + [note the nop - the interrupt-enable boundary on x86 is two + instructions from sti] - to flush out pending hardirqs and + IPIs. After this point nothing is supposed to reach this CPU." */ + __asm__ __volatile__("sti; nop; cli"); + barrier(); +#else + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +#endif +} +#endif + diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c index 0f9a2eed70..9061053081 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c @@ -13,6 +13,7 @@ #include +#include #include #include #include @@ -54,6 +55,9 @@ #include #include +#include +#include + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int hlt_counter; @@ -112,6 +116,33 @@ void xen_idle(void) } } +#ifdef CONFIG_HOTPLUG_CPU +#include +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* We shouldn't have to disable interrupts while dead, but + * some interrupts just don't seem to go away, and this makes + * it "work" for testing purposes. */ + /* Death loop */ + while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) + HYPERVISOR_yield(); + + local_irq_disable(); + __flush_tlb_all(); + cpu_set(smp_processor_id(), cpu_online_map); + local_irq_enable(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a @@ -130,6 +161,9 @@ void cpu_idle (void) cpu_clear(cpu, cpu_idle_map); rmb(); + if (cpu_is_offline(cpu)) + play_dead(); + irq_stat[cpu].idle_timestamp = jiffies; xen_idle(); } diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smp.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smp.c index 915034b36d..fddadbba25 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smp.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smp.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -185,6 +186,7 @@ void send_IPI_mask_bitmask(cpumask_t mask, int vector) unsigned int cpu; local_irq_save(flags); + WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]); for (cpu = 0; cpu < NR_CPUS; ++cpu) { if (cpu_isset(cpu, mask)) { @@ -320,21 +322,21 @@ out: static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { - cpumask_t tmp; /* * A couple of (to be removed) sanity checks: * - * - we do not send IPIs to not-yet booted CPUs. * - current CPU must not be in mask * - mask must exist :) */ BUG_ON(cpus_empty(cpumask)); - - cpus_and(tmp, cpumask, cpu_online_map); - BUG_ON(!cpus_equal(cpumask, tmp)); BUG_ON(cpu_isset(smp_processor_id(), cpumask)); BUG_ON(!mm); + /* If a CPU which we ran on has gone down, OK. */ + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return; + /* * i'm not happy about this global shared spinlock in the * MM hot path, but we'll see how contended it is. @@ -465,6 +467,7 @@ void flush_tlb_all(void) */ void smp_send_reschedule(int cpu) { + WARN_ON(cpu_is_offline(cpu)); send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } @@ -505,10 +508,16 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, */ { struct call_data_struct data; - int cpus = num_online_cpus()-1; + int cpus; + + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + cpus = num_online_cpus()-1; - if (!cpus) + if (!cpus) { + spin_unlock(&call_lock); return 0; + } /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -520,7 +529,6 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, if (wait) atomic_set(&data.finished, 0); - spin_lock(&call_lock); call_data = &data; mb(); diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smpboot.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smpboot.c index ec1b3b9dea..34570a9398 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smpboot.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smpboot.c @@ -44,6 +44,9 @@ #include #include #include +#include +#include +#include #include #include @@ -93,7 +96,14 @@ extern unsigned char trampoline_data []; extern unsigned char trampoline_end []; static unsigned char *trampoline_base; static int trampoline_exec; +#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* State of each CPU. */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; +#endif +#if 0 /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller @@ -500,6 +510,7 @@ static int __init start_secondary(void *unused) } } cpu_idle(); + return 0; } /* @@ -1284,6 +1295,9 @@ static void __init smp_boot_cpus(unsigned int max_cpus) who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) { + smp_commenced_mask = cpumask_of_cpu(0); + cpu_callin_map = cpumask_of_cpu(0); + mb(); smp_boot_cpus(max_cpus); } @@ -1293,20 +1307,97 @@ void __devinit smp_prepare_boot_cpu(void) cpu_set(smp_processor_id(), cpu_callout_map); } -int __devinit __cpu_up(unsigned int cpu) +#ifdef CONFIG_HOTPLUG_CPU + +/* must be called with the cpucontrol mutex held */ +static int __devinit cpu_enable(unsigned int cpu) { - /* This only works at boot for x86. See "rewrite" above. */ - if (cpu_isset(cpu, smp_commenced_mask)) { - local_irq_enable(); - return -ENOSYS; + /* get the target out of its holding state */ + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + wmb(); + + /* wait for the processor to ack it. timeout? */ + while (!cpu_online(cpu)) + cpu_relax(); + + fixup_irqs(cpu_online_map); + /* counter the disable in fixup_irqs() */ + local_irq_enable(); + return 0; +} + +int __cpu_disable(void) +{ + cpumask_t map = cpu_online_map; + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + /* Allow any queued timer interrupts to get serviced */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); + + cpu_clear(cpu, map); + fixup_irqs(map); + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) + return; + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} +#else /* ... !CONFIG_HOTPLUG_CPU */ +int __cpu_disable(void) +{ + return -ENOSYS; +} +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +int __devinit __cpu_up(unsigned int cpu) +{ /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { + printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); local_irq_enable(); return -EIO; } +#ifdef CONFIG_HOTPLUG_CPU + /* Already up, and in cpu_quiescent now? */ + if (cpu_isset(cpu, smp_commenced_mask)) { + cpu_enable(cpu); + return 0; + } +#endif + local_irq_enable(); /* Unleash the CPU! */ cpu_set(cpu, smp_commenced_mask); diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c index a6615b7e18..d0718b5299 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c @@ -609,6 +609,14 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) nmi_enter(); cpu = smp_processor_id(); + +#ifdef CONFIG_HOTPLUG_CPU + if (!cpu_online(cpu)) { + nmi_exit(); + return; + } +#endif + ++nmi_count(cpu); if (!nmi_callback(regs, cpu)) diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mach-default/Makefile b/linux-2.6.11-xen-sparse/arch/xen/i386/mach-default/Makefile new file mode 100644 index 0000000000..7d50b2926e --- /dev/null +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mach-default/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the linux kernel. +# + +c-obj-y := topology.o + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y)): + @ln -fsn $(srctree)/arch/i386/mach-default/$(notdir $@) $@ + +obj-y += $(c-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-)) diff --git a/patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch b/patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch new file mode 100644 index 0000000000..ec39143743 --- /dev/null +++ b/patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch @@ -0,0 +1,656 @@ + +From: Zwane Mwaikambo + +Find attached the i386 cpu hotplug patch updated for Ingo's latest round of +goodies. In order to avoid dumping cpu hotplug code into kernel/irq/* i +dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The +difference being that on cpu offline, fixup_irqs() is called before we +clear the cpu from cpu_online_map and a long delay in order to ensure that +we never have any queued external interrupts on the APICs. Due to my usual +test victims being in boxes a continent away this hasn't been tested, but +i'll cover bug reports (nudge, Nathan! ;) + +1) Add CONFIG_HOTPLUG_CPU +2) disable local APIC timer on dead cpus. +3) Disable preempt around irq balancing to prevent CPUs going down. +4) Print irq stats for all possible cpus. +5) Debugging check for interrupts on offline cpus. +6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. +7) play_dead() for offline cpus to spin inside. +8) Handle offline cpus set in flush_tlb_others(). +9) Grab lock earlier in smp_call_function() to prevent CPUs going down. +10) Implement __cpu_disable() and __cpu_die(). +11) Enable local interrupts in cpu_enable() after fixup_irqs() +12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. +13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. + +Signed-off-by: Zwane Mwaikambo +DESC +ppc64: fix hotplug cpu +EDESC +From: Zwane Mwaikambo + +I seem to have broken this when I moved the clearing of the dying cpu to +arch specific code. + +Signed-off-by: Zwane Mwaikambo +Signed-off-by: Andrew Morton +--- + + 25-akpm/arch/i386/Kconfig | 9 ++ + 25-akpm/arch/i386/kernel/apic.c | 3 + 25-akpm/arch/i386/kernel/io_apic.c | 2 + 25-akpm/arch/i386/kernel/irq.c | 66 +++++++++++++++++---- + 25-akpm/arch/i386/kernel/msr.c | 2 + 25-akpm/arch/i386/kernel/process.c | 35 +++++++++++ + 25-akpm/arch/i386/kernel/smp.c | 25 +++++--- + 25-akpm/arch/i386/kernel/smpboot.c | 98 ++++++++++++++++++++++++++++++-- + 25-akpm/arch/i386/kernel/traps.c | 8 ++ + 25-akpm/arch/ia64/kernel/smpboot.c | 3 + 25-akpm/arch/ppc64/kernel/pSeries_smp.c | 5 + + 25-akpm/arch/s390/kernel/smp.c | 4 - + 25-akpm/include/asm-i386/cpu.h | 2 + 25-akpm/include/asm-i386/irq.h | 4 + + 25-akpm/include/asm-i386/smp.h | 3 + 25-akpm/kernel/cpu.c | 14 +--- + arch/ppc64/kernel/smp.c | 0 + 17 files changed, 242 insertions(+), 41 deletions(-) + +diff -puN arch/i386/Kconfig~i386-cpu-hotplug-updated-for-mm arch/i386/Kconfig +--- 25/arch/i386/Kconfig~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/i386/Kconfig 2005-02-23 02:20:06.000000000 -0800 +@@ -1205,6 +1205,15 @@ config SCx200 + This support is also available as a module. If compiled as a + module, it will be called scx200. + ++config HOTPLUG_CPU ++ bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" ++ depends on SMP && HOTPLUG && EXPERIMENTAL ++ ---help--- ++ Say Y here to experiment with turning CPUs off and on. CPUs ++ can be controlled through /sys/devices/system/cpu. ++ ++ Say N. ++ + source "drivers/pcmcia/Kconfig" + + source "drivers/pci/hotplug/Kconfig" +diff -puN arch/i386/kernel/apic.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/apic.c +--- 25/arch/i386/kernel/apic.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/i386/kernel/apic.c 2005-02-23 02:20:06.000000000 -0800 +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1048,7 +1049,7 @@ void __init setup_secondary_APIC_clock(v + setup_APIC_timer(calibration_result); + } + +-void __init disable_APIC_timer(void) ++void __devinit disable_APIC_timer(void) + { + if (using_apic_timer) { + unsigned long v; +diff -puN arch/i386/kernel/io_apic.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/io_apic.c +--- 25/arch/i386/kernel/io_apic.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/i386/kernel/io_apic.c 2005-02-23 02:20:06.000000000 -0800 +@@ -576,9 +576,11 @@ static int balanced_irq(void *unused) + try_to_freeze(PF_FREEZE); + if (time_after(jiffies, + prev_balance_time+balanced_irq_interval)) { ++ preempt_disable(); + do_irq_balance(); + prev_balance_time = jiffies; + time_remaining = balanced_irq_interval; ++ preempt_enable(); + } + } + return 0; +diff -puN arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/irq.c +--- 25/arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/i386/kernel/irq.c 2005-02-23 02:20:06.000000000 -0800 +@@ -15,6 +15,9 @@ + #include + #include + #include ++#include ++#include ++#include + + #ifndef CONFIG_X86_LOCAL_APIC + /* +@@ -209,9 +212,8 @@ int show_interrupts(struct seq_file *p, + + if (i == 0) { + seq_printf(p, " "); +- for (j=0; jtypename); + seq_printf(p, " %s", action->name); +@@ -239,16 +240,13 @@ skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); +- for (j = 0; j < NR_CPUS; j++) +- if (cpu_online(j)) +- seq_printf(p, "%10u ", nmi_count(j)); ++ for_each_cpu(j) ++ seq_printf(p, "%10u ", nmi_count(j)); + seq_putc(p, '\n'); + #ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); +- for (j = 0; j < NR_CPUS; j++) +- if (cpu_online(j)) +- seq_printf(p, "%10u ", +- irq_stat[j].apic_timer_irqs); ++ for_each_cpu(j) ++ seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs); + seq_putc(p, '\n'); + #endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +@@ -258,3 +256,45 @@ skip: + } + return 0; + } ++ ++#ifdef CONFIG_HOTPLUG_CPU ++#include ++ ++void fixup_irqs(cpumask_t map) ++{ ++ unsigned int irq; ++ static int warned; ++ ++ for (irq = 0; irq < NR_IRQS; irq++) { ++ cpumask_t mask; ++ if (irq == 2) ++ continue; ++ ++ cpus_and(mask, irq_affinity[irq], map); ++ if (any_online_cpu(mask) == NR_CPUS) { ++ printk("Breaking affinity for irq %i\n", irq); ++ mask = map; ++ } ++ if (irq_desc[irq].handler->set_affinity) ++ irq_desc[irq].handler->set_affinity(irq, mask); ++ else if (irq_desc[irq].action && !(warned++)) ++ printk("Cannot set affinity for irq %i\n", irq); ++ } ++ ++#if 0 ++ barrier(); ++ /* Ingo Molnar says: "after the IO-APIC masks have been redirected ++ [note the nop - the interrupt-enable boundary on x86 is two ++ instructions from sti] - to flush out pending hardirqs and ++ IPIs. After this point nothing is supposed to reach this CPU." */ ++ __asm__ __volatile__("sti; nop; cli"); ++ barrier(); ++#else ++ /* That doesn't seem sufficient. Give it 1ms. */ ++ local_irq_enable(); ++ mdelay(1); ++ local_irq_disable(); ++#endif ++} ++#endif ++ +diff -puN arch/i386/kernel/msr.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/msr.c +--- 25/arch/i386/kernel/msr.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/i386/kernel/msr.c 2005-02-23 02:20:06.000000000 -0800 +@@ -260,7 +260,7 @@ static struct file_operations msr_fops = + .open = msr_open, + }; + +-static int msr_class_simple_device_add(int i) ++static int __devinit msr_class_simple_device_add(int i) + { + int err = 0; + struct class_device *class_err; +diff -puN arch/i386/kernel/process.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/process.c +--- 25/arch/i386/kernel/process.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/i386/kernel/process.c 2005-02-23 02:20:06.000000000 -0800 +@@ -13,6 +13,7 @@ + + #include + ++#include + #include + #include + #include +@@ -55,6 +56,9 @@ + #include + #include + ++#include ++#include ++ + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); + + int hlt_counter; +@@ -139,6 +143,34 @@ static void poll_idle (void) + } + } + ++#ifdef CONFIG_HOTPLUG_CPU ++#include ++/* We don't actually take CPU down, just spin without interrupts. */ ++static inline void play_dead(void) ++{ ++ /* Ack it */ ++ __get_cpu_var(cpu_state) = CPU_DEAD; ++ ++ /* We shouldn't have to disable interrupts while dead, but ++ * some interrupts just don't seem to go away, and this makes ++ * it "work" for testing purposes. */ ++ /* Death loop */ ++ while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) ++ cpu_relax(); ++ ++ local_irq_disable(); ++ __flush_tlb_all(); ++ cpu_set(smp_processor_id(), cpu_online_map); ++ enable_APIC_timer(); ++ local_irq_enable(); ++} ++#else ++static inline void play_dead(void) ++{ ++ BUG(); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ + /* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a +@@ -162,6 +194,9 @@ void cpu_idle (void) + if (!idle) + idle = default_idle; + ++ if (cpu_is_offline(cpu)) ++ play_dead(); ++ + irq_stat[cpu].idle_timestamp = jiffies; + idle(); + } +diff -puN arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/smpboot.c +--- 25/arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/i386/kernel/smpboot.c 2005-02-23 02:20:06.000000000 -0800 +@@ -44,6 +44,9 @@ + #include + #include + #include ++#include ++#include ++#include + + #include + #include +@@ -89,6 +92,9 @@ extern unsigned char trampoline_end []; + static unsigned char *trampoline_base; + static int trampoline_exec; + ++/* State of each CPU. */ ++DEFINE_PER_CPU(int, cpu_state) = { 0 }; ++ + /* + * Currently trivial. Write the real->protected mode + * bootstrap into the page concerned. The caller +@@ -1095,6 +1101,9 @@ static void __init smp_boot_cpus(unsigne + who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ + void __init smp_prepare_cpus(unsigned int max_cpus) + { ++ smp_commenced_mask = cpumask_of_cpu(0); ++ cpu_callin_map = cpumask_of_cpu(0); ++ mb(); + smp_boot_cpus(max_cpus); + } + +@@ -1104,20 +1113,99 @@ void __devinit smp_prepare_boot_cpu(void + cpu_set(smp_processor_id(), cpu_callout_map); + } + +-int __devinit __cpu_up(unsigned int cpu) ++#ifdef CONFIG_HOTPLUG_CPU ++ ++/* must be called with the cpucontrol mutex held */ ++static int __devinit cpu_enable(unsigned int cpu) + { +- /* This only works at boot for x86. See "rewrite" above. */ +- if (cpu_isset(cpu, smp_commenced_mask)) { +- local_irq_enable(); +- return -ENOSYS; ++ /* get the target out of its holding state */ ++ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; ++ wmb(); ++ ++ /* wait for the processor to ack it. timeout? */ ++ while (!cpu_online(cpu)) ++ cpu_relax(); ++ ++ fixup_irqs(cpu_online_map); ++ /* counter the disable in fixup_irqs() */ ++ local_irq_enable(); ++ return 0; ++} ++ ++int __cpu_disable(void) ++{ ++ cpumask_t map = cpu_online_map; ++ int cpu = smp_processor_id(); ++ ++ /* ++ * Perhaps use cpufreq to drop frequency, but that could go ++ * into generic code. ++ * ++ * We won't take down the boot processor on i386 due to some ++ * interrupts only being able to be serviced by the BSP. ++ * Especially so if we're not using an IOAPIC -zwane ++ */ ++ if (cpu == 0) ++ return -EBUSY; ++ ++ /* We enable the timer again on the exit path of the death loop */ ++ disable_APIC_timer(); ++ /* Allow any queued timer interrupts to get serviced */ ++ local_irq_enable(); ++ mdelay(1); ++ local_irq_disable(); ++ ++ cpu_clear(cpu, map); ++ fixup_irqs(map); ++ /* It's now safe to remove this processor from the online map */ ++ cpu_clear(cpu, cpu_online_map); ++ return 0; ++} ++ ++void __cpu_die(unsigned int cpu) ++{ ++ /* We don't do anything here: idle task is faking death itself. */ ++ unsigned int i; ++ ++ for (i = 0; i < 10; i++) { ++ /* They ack this in play_dead by setting CPU_DEAD */ ++ if (per_cpu(cpu_state, cpu) == CPU_DEAD) ++ return; ++ current->state = TASK_UNINTERRUPTIBLE; ++ schedule_timeout(HZ/10); + } ++ printk(KERN_ERR "CPU %u didn't die...\n", cpu); ++} ++#else /* ... !CONFIG_HOTPLUG_CPU */ ++int __cpu_disable(void) ++{ ++ return -ENOSYS; ++} + ++void __cpu_die(unsigned int cpu) ++{ ++ /* We said "no" in __cpu_disable */ ++ BUG(); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __devinit __cpu_up(unsigned int cpu) ++{ + /* In case one didn't come up */ + if (!cpu_isset(cpu, cpu_callin_map)) { ++ printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); + local_irq_enable(); + return -EIO; + } + ++#ifdef CONFIG_HOTPLUG_CPU ++ /* Already up, and in cpu_quiescent now? */ ++ if (cpu_isset(cpu, smp_commenced_mask)) { ++ cpu_enable(cpu); ++ return 0; ++ } ++#endif ++ + local_irq_enable(); + /* Unleash the CPU! */ + cpu_set(cpu, smp_commenced_mask); +diff -puN arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/smp.c +--- 25/arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/i386/kernel/smp.c 2005-02-23 02:20:06.000000000 -0800 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -163,7 +164,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu + unsigned long flags; + + local_irq_save(flags); +- ++ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + /* + * Wait for idle. + */ +@@ -345,21 +346,21 @@ out: + static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, + unsigned long va) + { +- cpumask_t tmp; + /* + * A couple of (to be removed) sanity checks: + * +- * - we do not send IPIs to not-yet booted CPUs. + * - current CPU must not be in mask + * - mask must exist :) + */ + BUG_ON(cpus_empty(cpumask)); +- +- cpus_and(tmp, cpumask, cpu_online_map); +- BUG_ON(!cpus_equal(cpumask, tmp)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(!mm); + ++ /* If a CPU which we ran on has gone down, OK. */ ++ cpus_and(cpumask, cpumask, cpu_online_map); ++ if (cpus_empty(cpumask)) ++ return; ++ + /* + * i'm not happy about this global shared spinlock in the + * MM hot path, but we'll see how contended it is. +@@ -484,6 +485,7 @@ void smp_send_nmi_allbutself(void) + */ + void smp_send_reschedule(int cpu) + { ++ WARN_ON(cpu_is_offline(cpu)); + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); + } + +@@ -524,10 +526,16 @@ int smp_call_function (void (*func) (voi + */ + { + struct call_data_struct data; +- int cpus = num_online_cpus()-1; ++ int cpus; + +- if (!cpus) ++ /* Holding any lock stops cpus from going down. */ ++ spin_lock(&call_lock); ++ cpus = num_online_cpus()-1; ++ ++ if (!cpus) { ++ spin_unlock(&call_lock); + return 0; ++ } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); +@@ -539,7 +547,6 @@ int smp_call_function (void (*func) (voi + if (wait) + atomic_set(&data.finished, 0); + +- spin_lock(&call_lock); + call_data = &data; + mb(); + +diff -puN arch/i386/kernel/traps.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/traps.c +--- 25/arch/i386/kernel/traps.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/i386/kernel/traps.c 2005-02-23 02:20:06.000000000 -0800 +@@ -669,6 +669,14 @@ fastcall void do_nmi(struct pt_regs * re + nmi_enter(); + + cpu = smp_processor_id(); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ if (!cpu_online(cpu)) { ++ nmi_exit(); ++ return; ++ } ++#endif ++ + ++nmi_count(cpu); + + if (!nmi_callback(regs, cpu)) +diff -puN arch/ia64/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/ia64/kernel/smpboot.c +--- 25/arch/ia64/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/ia64/kernel/smpboot.c 2005-02-23 02:20:06.000000000 -0800 +@@ -590,9 +590,10 @@ int __cpu_disable(void) + if (cpu == 0) + return -EBUSY; + ++ cpu_clear(cpu, cpu_online_map); + fixup_irqs(); + local_flush_tlb_all(); +- printk ("Disabled cpu %u\n", smp_processor_id()); ++ printk("Disabled cpu %u\n", cpu); + return 0; + } + +diff -puN arch/ppc64/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/ppc64/kernel/smp.c +diff -puN arch/s390/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/s390/kernel/smp.c +--- 25/arch/s390/kernel/smp.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/arch/s390/kernel/smp.c 2005-02-23 02:20:06.000000000 -0800 +@@ -679,12 +679,14 @@ __cpu_disable(void) + { + unsigned long flags; + ec_creg_mask_parms cr_parms; ++ int cpu = smp_processor_id(); + + spin_lock_irqsave(&smp_reserve_lock, flags); +- if (smp_cpu_reserved[smp_processor_id()] != 0) { ++ if (smp_cpu_reserved[cpu] != 0) { + spin_unlock_irqrestore(&smp_reserve_lock, flags); + return -EBUSY; + } ++ cpu_clear(cpu, cpu_online_map); + + #ifdef CONFIG_PFAULT + /* Disable pfault pseudo page faults on this cpu. */ +diff -puN include/asm-i386/cpu.h~i386-cpu-hotplug-updated-for-mm include/asm-i386/cpu.h +--- 25/include/asm-i386/cpu.h~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/include/asm-i386/cpu.h 2005-02-23 02:20:06.000000000 -0800 +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + #include + +@@ -17,4 +18,5 @@ extern int arch_register_cpu(int num); + extern void arch_unregister_cpu(int); + #endif + ++DECLARE_PER_CPU(int, cpu_state); + #endif /* _ASM_I386_CPU_H_ */ +diff -puN include/asm-i386/irq.h~i386-cpu-hotplug-updated-for-mm include/asm-i386/irq.h +--- 25/include/asm-i386/irq.h~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/include/asm-i386/irq.h 2005-02-23 02:20:06.000000000 -0800 +@@ -38,4 +38,8 @@ extern void release_vm86_irqs(struct tas + extern int irqbalance_disable(char *str); + #endif + ++#ifdef CONFIG_HOTPLUG_CPU ++extern void fixup_irqs(cpumask_t map); ++#endif ++ + #endif /* _ASM_IRQ_H */ +diff -puN include/asm-i386/smp.h~i386-cpu-hotplug-updated-for-mm include/asm-i386/smp.h +--- 25/include/asm-i386/smp.h~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/include/asm-i386/smp.h 2005-02-23 02:20:06.000000000 -0800 +@@ -85,6 +85,9 @@ static __inline int logical_smp_processo + } + + #endif ++ ++extern int __cpu_disable(void); ++extern void __cpu_die(unsigned int cpu); + #endif /* !__ASSEMBLY__ */ + + #define NO_PROC_ID 0xFF /* No processor magic marker */ +diff -puN kernel/cpu.c~i386-cpu-hotplug-updated-for-mm kernel/cpu.c +--- 25/kernel/cpu.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 ++++ 25-akpm/kernel/cpu.c 2005-02-23 02:20:06.000000000 -0800 +@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused) + { + int err; + +- /* Take offline: makes arch_cpu_down somewhat easier. */ +- cpu_clear(smp_processor_id(), cpu_online_map); +- + /* Ensure this CPU doesn't handle any more interrupts. */ + err = __cpu_disable(); + if (err < 0) +- cpu_set(smp_processor_id(), cpu_online_map); +- else +- /* Force idle task to run as soon as we yield: it should +- immediately notice cpu is offline and die quickly. */ +- sched_idle_next(); ++ return err; + +- return err; ++ /* Force idle task to run as soon as we yield: it should ++ immediately notice cpu is offline and die quickly. */ ++ sched_idle_next(); ++ return 0; + } + + int cpu_down(unsigned int cpu) +diff -puN arch/ppc64/kernel/pSeries_smp.c~i386-cpu-hotplug-updated-for-mm arch/ppc64/kernel/pSeries_smp.c +--- 25/arch/ppc64/kernel/pSeries_smp.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:08.000000000 -0800 ++++ 25-akpm/arch/ppc64/kernel/pSeries_smp.c 2005-02-23 02:20:08.000000000 -0800 +@@ -86,10 +86,13 @@ static int query_cpu_stopped(unsigned in + + int pSeries_cpu_disable(void) + { ++ int cpu = smp_processor_id(); ++ ++ cpu_clear(cpu, cpu_online_map); + systemcfg->processorCount--; + + /*fix boot_cpuid here*/ +- if (smp_processor_id() == boot_cpuid) ++ if (cpu == boot_cpuid) + boot_cpuid = any_online_cpu(cpu_online_map); + + /* FIXME: abstract this to not be platform specific later on */ +_ -- 2.30.2